##################################################################

# Gautam Nair
# gautam.nair@yale.edu
# Misperceptions of Relative Affluence and Support for International Transfers
# Make Figure 4: Misperceptions of Rank and Median among College Subset

##################################################################

# setwd("")

##################################################################
# loading packages
##################################################################

library(Hmisc)
library(foreign)
library(sandwich)
library(lmtest)
library(numDeriv)
library(stargazer)
library(ggplot2)
library(plyr)
library(gridExtra)
library(ri)
library(dplyr)
library(plyr)
library(scales)

##################################################################

rm(list=ls())
data.working <- readRDS("d_r_cleaned_analysis_dataset.Rda")
data.working <- data.working[data.working$educat.ba.plus==1,]

####################################################################
# Summary Statistics of Estimated Quantities
####################################################################

# estimated median
table(data.working$q3b)
quantile(data.working$q3b, na.rm=TRUE)

# actual rank
table(data.working$rank.global.true)
quantile(data.working$rank.global.true, na.rm=TRUE)

# estimated rank
table(data.working$q3a)
quantile(data.working$q3a, na.rm=TRUE)

# bias (actual - estimated rank)
table(data.working$rank.estimate.diff)
quantile(data.working$rank.estimate.diff, na.rm=TRUE)

####################################################################
# ECDF of estimated median income
####################################################################

myvars <- c("q3a", "q3b", "income.new", "rank.global.true", "rank.estimate.categorical", "rank.estimate.diff")
data.subset <- data.working[myvars]
data.subset$median.estimate <- data.subset$q3b
data.subset$median.estimate[data.subset$median.estimate>=75000]<-75000

plot.ecdf <- ggplot(data.subset, aes(x=median.estimate)) + stat_ecdf(colour="royalblue4", size=0.8) + scale_x_continuous(breaks=c(0, 10000,20000, 30000,40000, 50000, 60000), labels=c("0", "10", "20","30" ,"40", "50", "60"), limits=c(0, 70000)) + scale_y_continuous(breaks=c(0,.20, 0.40,0.60, 0.80, 1.00)) + xlab("Estimate of Global Median Income in '000 $") + ylab("Cumulative Probability") + theme(plot.title = element_text(size = 11), aspect.ratio=1, axis.text=element_text(size=11),
        axis.title=element_text(size=11),plot.title = element_text(size = 8)#,  
        #plot.margin=unit(c(2,2,2,2),"mm")
) + 
geom_vline(xintercept =2100, color="black", size=0.2) + 
geom_vline(xintercept =median(data.subset$median.estimate,na.rm=TRUE), color="black", size=0.2) +
annotate("text", label = "True Value \n~$2,100", x = 2800, y = 0.9, size = 3, colour = "black", hjust=0) +
annotate("text", label = "Median Respondents' \nEstimate", x = 21000, y = 0.9, size = 3, colour = "black", hjust=0) +
 theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_rect(colour = "black", size=0.8, fill=NA))
#ggtitle("1A Respondents' Median Estimate")
plot.ecdf
ggsave(plot.ecdf, filename="tf_f_04_a_college_estimated_median_income.png", width=4,height=4)

####################################################################
# Plot of estimated versus actual rank
####################################################################

data.subset <- na.omit(data.subset)
median.rank.estimate <- median(data.subset$q3a)
median.rank.true <- median(data.subset$rank.global.true)
table(data.subset$rank.estimate.categorical)
data.subset$rank.estimate.categorical <- factor(data.subset$rank.estimate.categorical, labels=c("Overestimate"="Overestimate (1%)", "Correct +/- 10"="Correct \u00B1 10 (41%)", "Underestimate"="Underestimate (58%)"))

rankplot.1 <- ggplot(data.subset, aes(rank.global.true,q3a))
rankplot.2 <- rankplot.1 + 
#geom_abline(intercept = 0, slope = 1, color="black", size=0.2) +
geom_abline(intercept = 13, slope = 1, color="black", size=0.2, linetype="dashed") +
geom_abline(intercept = -13, slope = 1, color="black", size=0.2, linetype="dashed") +
geom_point(aes(colour=rank.estimate.categorical, shape=rank.estimate.categorical), position = position_jitter(w = 8.0, h = 2.5), size = 0.8) + 
scale_shape_manual(values=c(2,1,6)) +
coord_fixed(ratio=1) +  
ylab("Respondent's Estimate of Rank") + 
xlab("Respondent's True Rank in Income Distribution") + 
scale_x_continuous(breaks=c(40, 50, 60, 70, 80, 90, 100), labels=c("40","50", "60", "70","80" ,"90", "100"), limits=c(40, 100)) + scale_y_continuous(breaks=c(40, 50, 60, 70, 80, 90, 100), labels=c("40", "50", "60", "70","80" ,"90", "100"), limits=c(40, 100))  +
theme(plot.title = element_text(size = 11), 
axis.text=element_text(size=11), 
axis.title=element_text(size=11), 
legend.text=element_text(size=9),
 legend.title=element_blank(),
legend.position=c(0.25,0.85) ,
legend.key = element_blank()
#plot.margin=unit(c(2,2,2,2),"mm")
) +
 theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_rect(colour = "black", size=0.8, fill=NA)) 
rankplot.2
ggsave(plot.ecdf, filename="tf_f_04_b_college_estimated_median_income.png", width=4,height=4)
levels(data.subset$rank.estimate.categorical)
#multiplot(plot.ecdf, rankplot.2, cols=2)

####################################################################
# ECDF of bias
####################################################################

P = ecdf(data.subset$rank.estimate.diff)
P(0.0)
plot.ecdf <- ggplot(data.subset, aes(x=rank.estimate.diff)) + stat_ecdf(colour="royalblue4", size=0.8) + scale_x_continuous(breaks=c(-100, -80, -60,-40,-20,0,20,40,60,80,100), limits=c(-100, 100)) + scale_y_continuous(breaks=c(0,0.20, 0.40,0.60, 0.80, 1.00)) + xlab("Bias = Respondent's Estimate \n of Own Rank - True Rank") + ylab("Cumulative Probability") + theme(plot.title = element_text(size = 11), aspect.ratio=1, axis.text=element_text(size=11),
        axis.title=element_text(size=11),plot.title = element_text(size = 8)#,  
        #plot.margin=unit(c(2,2,2,2),"mm")
) + 
geom_vline(xintercept =0, color="black", size=0.2) + 
geom_vline(xintercept =median(data.subset$median.estimate,na.rm=TRUE), color="black", size=0.2) +
geom_hline(yintercept =P(-1), color="black", size=0.2, linetype="dashed") + # horizontal line separating overestimators
 theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_rect(colour = "black", size=0.8, fill=NA)) +
 annotate("text", label = "82% of Respondents \n Underestimate Rank \n (\u22651 Percentile)", x = -100, y = 0.75, size = 3, colour = "black", hjust=0) 
#ggtitle("1A Respondents' Median Estimate")
plot.ecdf
ggsave(plot.ecdf, filename="tf_f_04_c_college_estimated_rank_bias.png", width=4,height=4)
